{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- movieId: integer (nullable = true)\n",
      " |-- rating: double (nullable = true)\n",
      " |-- userId: integer (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "from pyspark.ml.recommendation import ALS\n",
    "from pyspark.ml.evaluation import RegressionEvaluator\n",
    "\n",
    "spark = SparkSession.builder.appName('recommender').getOrCreate()\n",
    "df = spark.read.csv('movielens_ratings.csv', inferSchema= True, header = True)\n",
    "df.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------+------+\n",
      "|movieId|rating|userId|\n",
      "+-------+------+------+\n",
      "|      2|   3.0|     0|\n",
      "|      3|   1.0|     0|\n",
      "|      5|   2.0|     0|\n",
      "+-------+------+------+\n",
      "only showing top 3 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.show(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------------------+------------------+------------------+\n",
      "|summary|           movieId|            rating|            userId|\n",
      "+-------+------------------+------------------+------------------+\n",
      "|  count|              1501|              1501|              1501|\n",
      "|   mean| 49.40572951365756|1.7741505662891406|14.383744170552964|\n",
      "| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|\n",
      "|    min|                 0|               1.0|                 0|\n",
      "|    max|                99|               5.0|                29|\n",
      "+-------+------------------+------------------+------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.describe().show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "train, test = df.randomSplit([0.8, 0.2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "als = ALS(maxIter=5, regParam=0.01, userCol='userId', itemCol='movieId', ratingCol='rating')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------+------+-----------+\n",
      "|movieId|rating|userId| prediction|\n",
      "+-------+------+------+-----------+\n",
      "|     31|   1.0|    26| -2.5238004|\n",
      "|     31|   1.0|    27|-0.59501255|\n",
      "|     31|   1.0|     4|   3.137197|\n",
      "|     85|   1.0|    28| -0.1683234|\n",
      "|     85|   1.0|    13|  2.2037606|\n",
      "|     85|   5.0|     8|   4.343044|\n",
      "|     85|   1.0|    29|  1.5260103|\n",
      "|     65|   1.0|    28|  3.4493313|\n",
      "|     53|   3.0|    13|   2.631197|\n",
      "|     53|   1.0|    25| -2.3101962|\n",
      "|     78|   1.0|    13| 0.54879403|\n",
      "|     78|   1.0|    11|  0.4418241|\n",
      "|     81|   5.0|    28|  0.8307642|\n",
      "|     81|   1.0|     1| -1.0092545|\n",
      "|     81|   1.0|     6|  2.4090357|\n",
      "|     81|   1.0|    19| 0.13363218|\n",
      "|     81|   1.0|    15|  0.5015665|\n",
      "|     28|   1.0|    23| -0.2624761|\n",
      "|     28|   1.0|     2|  1.4344041|\n",
      "|     76|   1.0|     1|  1.9119977|\n",
      "+-------+------+------+-----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "model = als.fit(train)\n",
    "predictions = model.transform(test)\n",
    "predictions.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RMSE: 1.8124486699552562\n"
     ]
    }
   ],
   "source": [
    "evaluator = RegressionEvaluator(metricName = 'rmse', labelCol = 'rating', predictionCol = 'prediction')\n",
    "rmse = evaluator.evaluate(predictions)\n",
    "print('RMSE:', rmse)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+-------+\n",
      "|userId|movieId|\n",
      "+------+-------+\n",
      "|    12|      4|\n",
      "|    12|     18|\n",
      "|    12|     22|\n",
      "|    12|     35|\n",
      "|    12|     38|\n",
      "|    12|     41|\n",
      "|    12|     45|\n",
      "|    12|     63|\n",
      "|    12|     79|\n",
      "|    12|     83|\n",
      "|    12|     95|\n",
      "|    12|     96|\n",
      "+------+-------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "this_user = test.filter(test['userId'] == 12).select('userId', 'movieId')\n",
    "this_user.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+-------+----------+\n",
      "|userId|movieId|prediction|\n",
      "+------+-------+----------+\n",
      "|    12|     22| 1.6517887|\n",
      "|    12|     96| 0.1308065|\n",
      "|    12|     41| 1.4067035|\n",
      "|    12|     35| 0.7640405|\n",
      "|    12|      4|-1.1053085|\n",
      "|    12|     63|  3.851338|\n",
      "|    12|     45|0.70455414|\n",
      "|    12|     38| 2.8361285|\n",
      "|    12|     95| 0.9426958|\n",
      "|    12|     83| 0.6145076|\n",
      "|    12|     79| 1.3491223|\n",
      "|    12|     18| -0.656619|\n",
      "+------+-------+----------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "recommendation_this_user = model.transform(this_user)\n",
    "recommendation_this_user.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+-------+----------+\n",
      "|userId|movieId|prediction|\n",
      "+------+-------+----------+\n",
      "|    12|     63|  3.851338|\n",
      "|    12|     38| 2.8361285|\n",
      "|    12|     22| 1.6517887|\n",
      "|    12|     41| 1.4067035|\n",
      "|    12|     79| 1.3491223|\n",
      "|    12|     95| 0.9426958|\n",
      "|    12|     35| 0.7640405|\n",
      "|    12|     45|0.70455414|\n",
      "|    12|     83| 0.6145076|\n",
      "|    12|     96| 0.1308065|\n",
      "|    12|     18| -0.656619|\n",
      "|    12|      4|-1.1053085|\n",
      "+------+-------+----------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "recommendation_this_user.orderBy('prediction', ascending=False).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
